Lots of data exploration inspiration from Michael Griffiths: https://www.kaggle.com/msjgriffiths/exploratory-analysis/code

# Read the three input tables: the cleaned anime and genre tables (strings kept
# as character) and the raw user ratings with null ratings already removed.
animes <- read.csv(
  "../data/clean/animes.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
genres <- read.csv(
  "../data/clean/genres.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
ratings <- read.csv("../data/raw/no_null_ratings.csv", header = TRUE)

Let’s look at the top anime by average rating.

# Bar chart of the N highest-rated anime, best first.
N <- 10
df <- data.frame(
  Anime = animes$name,
  Rating = animes$rating,
  stringsAsFactors = FALSE
)

# Sort by rating, highest first (order() places NA ratings at the end).
df <- df[order(df$Rating, decreasing = TRUE), ]

# head() returns at most N rows; df[1:N, ] would pad with all-NA rows whenever
# fewer than N anime are available.
df <- head(df, N)

# Build the factor only from the rows that are plotted, in sorted order, so the
# x axis keeps the ranking. unique() guards against factor() rejecting
# duplicated levels if two anime share a name.
df$Anime <- factor(df$Anime, levels = unique(df$Anime))

df %>% ggplot(aes(x = Anime, y = Rating)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  coord_cartesian(ylim = c(9.0, 10.0)) +  # zoom the y axis to the 9-10 band
  labs(title = "Top Anime Ratings",
       caption = "source: MAL dataset") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

Let’s explore the distribution of ratings among some different groupings

First, does type (Movie, TV show,…) matter?

# Overlaid rating densities, one semi-transparent curve per anime type.
ggplot(animes, aes(x = rating, group = type)) +
  geom_density(aes(fill = type), alpha = 0.4) +
  xlim(0, 10) +
  labs(title = "Average Anime Ratings by Type")
## Warning: Removed 230 rows containing non-finite values (stat_density).

Looks like it does! Let’s look more closely at the variation between types

# Per-type mean and standard deviation of member counts and ratings.
# Ratings contain missing values, so they are dropped explicitly; TRUE is
# spelled out because T is an ordinary variable that can be reassigned.
animes %>%
  group_by(type) %>%
  summarise(
    average.viewers = mean(members),
    sd.viewers = sd(members),
    average.rating = mean(rating, na.rm = TRUE),
    sd.rating = sd(rating, na.rm = TRUE)
  ) %>%
  formattable()
type average.viewers sd.viewers average.rating sd.rating
6537.400 13278.495 NaN NA
Movie 10369.094 30898.076 6.318414 1.2119725
Music 1311.840 4548.136 5.588996 0.9584401
ONA 4114.030 12399.959 5.643298 1.1270907
OVA 5986.140 15026.128 6.375221 0.8583584
Special 7676.061 15546.290 6.523501 0.8877620
TV 42683.658 89121.009 6.902299 0.8635256

Explore the genres too!

# Attach each anime's rating to its genre rows, then draw one rating density
# per genre as an interactive plot.
animes_subset_ratings <- select(animes, "anime_id", "rating")
genres_with_ratings <- genres %>%
  inner_join(animes_subset_ratings, by = "anime_id")

g <- ggplot(genres_with_ratings, aes(x = rating, group = genre)) +
  geom_density(aes(fill = genre), alpha = 0.4)

ggplotly(g)
## Warning: Removed 690 rows containing non-finite values (stat_density).

The Dementia genre is very unpopular. The Harem genre has a very high thin peak, indicating little variance. Maybe they’re all basically the same?

In our dataset, what’s the distribution of user ratings?

# Count of user ratings at each score; the score is treated as a category.
g <- ggplot(ratings, aes(x = factor(rating))) +
  geom_bar() +
  labs(title = "Distribution of ratings")

ggplotly(g)

What about the distribution of users’ average rating?

# Density of each user's mean rating (one value per user_id).
g <- ratings %>%
  group_by(user_id) %>%
  summarise(m = mean(rating)) %>%
  ggplot(aes(x = m)) +
  labs(title = "Distribution of average rating over users") +
  geom_density()

ggplotly(g)

School Days is a notoriously controversial anime. Is the variance of users’ ratings of this anime higher than it is for most anime?

# Compare the spread of user ratings for "School Days" with the spread of all
# user ratings.
school_days.id <- filter(animes, name == "School Days")$anime_id
# %in% stays correct if the name lookup returns more than one id; == would
# silently recycle the id vector against the anime_id column.
school_days.ratings <- filter(ratings, anime_id %in% school_days.id)
school_days.sd <- sd(school_days.ratings$rating)
print(school_days.sd)
## [1] 2.354353
summary(ratings$rating)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   7.000   8.000   7.808   9.000  10.000
# NOTE(review): this is the sd of all ratings pooled across every anime, not a
# typical per-anime sd - confirm this is the intended baseline.
ratings.sd <- sd(ratings$rating)
print(ratings.sd)
## [1] 1.572496

Yes it is! Let’s visualize that

#school_days.ratings %>% ggplot(aes(x = bin, y = ..density.., group = source, fill = source)) +
#    geom_bar(alpha = 0.5, position = 'identity')

# Base plot: only data and the x mapping. Each view below adds its own layer;
# `g + geom_*()` returns a new plot and leaves g itself without layers.
g <- school_days.ratings %>% ggplot(aes(x = rating))

g + geom_density()

g + geom_bar() # Bar plot

# Hand the layered bar plot to plotly; g alone has no geom to draw.
ggplotly(g + geom_bar())
# Create a fake grouping variable, for a boxplot of 1 dim
g + geom_violin(aes(x = factor(0), y = rating), trim = FALSE, adjust = 2) +
  xlab("") + scale_x_discrete(breaks = NULL)

# Yeah, looks like the School Days ratings really are more spread out.

# Let’s compute class rankings to find the true Weebs

# Inputs for the weeb score.
# Assumed score a student would give their favorite anime.
MAX_SCORE <- 10
# Assumed score a student would give their least favorite anime.
MIN_SCORE <- 1

# In-class survey: students[i] named fav_animes[i] as their favorite.
# An empty string means no answer was given.
students <- c(
  "Adriana", "Beau", "David", "Fanny", "Joe", "Kevin",
  "Lilly (Ralf)", "Lydia", "Mac", "Michael", "Noah", "Richard",
  "Roger", "Saad", "Shane", "Stephanie", "Ty", "Xiaotai"
)
fav_animes <- c(
  "Psycho-Pass",
  "One Punch Man",
  "Cowboy Bebop",
  "",
  "FLCL",
  "Death Note",
  "Last Exile",
  "JoJo no Kimyou na Bouken (TV)",
  "Pokemon",
  "Tonari no Totoro",
  "Ginga Eiyuu Densetsu",
  "Afro Samurai",
  "Yuri!!! on Ice",
  "Dragon Ball Z",
  "JoJo no Kimyou na Bouken: Diamond wa Kudakenai",
  "Ouran Koukou Host Club",
  "Mushishi",
  "Doraemon (1979)"
)

# Culture score for one favorite anime: the squared gap between MAX_SCORE and
# the anime's rating in `animes` (lower is better).
#
# fav_anime: a single anime title, matched exactly against animes$name.
# Returns one number. A title with no match in `animes` is scored with
# MIN_SCORE. If several rows share the title, the first match is used so the
# result is always a single score. An NA rating yields an NA score.
# Reads the globals `animes`, `MAX_SCORE` and `MIN_SCORE`.
calculate_culture_score <- function(fav_anime) {
  # which() ignores NA comparisons, so rows with a missing name never match.
  matched_ratings <- animes$rating[which(animes$name == fav_anime)]
  if (length(matched_ratings) == 0) { # no result
    rating <- MIN_SCORE
  } else {
    rating <- matched_ratings[[1]]
  }
  culture.score <- (MAX_SCORE - rating)^2
  #if (culture.score < 1) {
    #print("Ah, I see you're a man of culture as well.")
  #}
  culture.score
}

# Score every favorite anime. vapply() fixes the result type to one number per
# title and fails loudly otherwise; sapply() would silently return a list if
# any score were not a single number. The result is named after fav_animes.
weeb.scores <- vapply(fav_animes, calculate_culture_score, numeric(1))

# The tooltip below reads the anime titles back from rownames(df); presumably
# they come from the names carried by weeb.scores.
df <- data.frame(Student = students, Score = weeb.scores, stringsAsFactors = TRUE) # want names as factors for plotting

df <- df[order(df$Score), ]  # sort by ranking
df$Student <- factor(df$Student, levels = df$Student)  # to retain the order in plot

g <- ggplot(df, aes(x = Student, y = Score, text = paste("Anime: ", rownames(df)))) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(title = "Class Rankings",
       subtitle = "Culture Score",
       caption = "source: In-Class Survey") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

ggplotly(g, tooltip = c("text", "x"))

The scale is thrown off by the one missing answer, which is scored as MIN_SCORE. Let’s try again with that outlier removed.

# Drop the last row of df. head(df, -1) says this directly; the previous index
# 1:nrow(df)-1 parsed as (1:nrow(df)) - 1, i.e. 0:(n-1), and only worked
# because R ignores index 0.
# NOTE(review): assumes df is still sorted by ascending Score so the outlier is
# the last row - confirm.
df <- head(df, -1) # chop off last row
g <- ggplot(df, aes(x = Student, y = Score, text = paste("Anime: ", rownames(df)))) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(title = "Class Rankings",
       subtitle = "Culture Score",
       caption = "source: In-Class Survey") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

ggplotly(g, tooltip = c("text", "x"))